knitr::opts_chunk$set(echo = TRUE)
invisible(lapply(
c("dplyr", "data.table", "matrixStats", "ggplot2", "reshape2",
"ggrepel", "pROC", "stringr", "tibble", "viridis", "ggridges"),
function(pkg) suppressPackageStartupMessages(library(pkg, character.only = TRUE))
))
##########################################################################################
## LSHTM: Maria’s data that she used for the hvCpG paper can be accessed from ing-p5 here:
# /mnt/old_user_accounts/p3/maria/PhD/Data/datasets/
#
# Her hvCpG scripts are here:
# /mnt/old_user_accounts/p3/maria/PhD/Projects/hvCpGs/Scripts/
## Load results of Maria for hvCpG
Marias_hvCpGs <- read.csv("~/2024_hvCpG/03_prepDatasetsMaria/Derakhshan2022_ST5_hvCpG.txt")
## Recreate Maria's datasets
folder1 <- normalizePath("/mnt/old_user_accounts/p3/maria/PhD/Data/datasets/GEO/BMIQ + 10 PCs + age + sex OUTLIERS REMOVED/")
rds_files1 <- list.files(path = folder1, pattern = "\\.RDS$", full.names = TRUE)
# Read all .rds files into a list and convert each to a matrix
rds_list_mat1 <- lapply(rds_files1, function(file) {as.matrix(readRDS(file))})
## Name the list elements by file names (without extensions)
names(rds_list_mat1) <- gsub("\\.RDS$", "", basename(rds_files1))
rds_list_mat2 <- readRDS("/mnt/old_user_accounts/p3/maria/PhD/Data/datasets/TCGA/TCGA_BMIQ_age_sex_PC_adjusted_OUTLIERS_REMOVED_round2.RDS")
rds_list_mat2 <- lapply(rds_list_mat2, as.matrix)
rds_list_mat <- c(rds_list_mat1, rds_list_mat2) ; rm(rds_list_mat1, rds_list_mat2)
# Keep only the array background (covered in 15 datasets out of 30)
all_cpgs <- unlist(lapply(rds_list_mat, rownames))
cpg_counts <- table(all_cpgs)
common_cpgs <- names(cpg_counts[cpg_counts >= 15])
rds_list_mat <- lapply(rds_list_mat, function(mat) {
mat[rownames(mat) %in% common_cpgs, ]
})
## Load mQTL-matched controls
cistrans_GoDMC_hvCpG_matched_control <-
read.table("~/2024_hvCpG/03_prepDatasetsMaria/cistrans_GoDMC_hvCpG_matched_control.txt", header = T)
## Load cpgnames (all cpg covered after filtration)
cpg_names_all <- rhdf5::h5read("/home/alice/arraysh5/all_matrix_noscale.h5", "cpg_names")
## Check if I'm consistent
table(cpg_names_all %in% common_cpgs) ## PERFECT
##
## TRUE
## 394240
sub_cistrans_GoDMC_hvCpG_matched_control <- cistrans_GoDMC_hvCpG_matched_control[
cistrans_GoDMC_hvCpG_matched_control$hvCpG_name %in% cpg_names_all &
cistrans_GoDMC_hvCpG_matched_control$controlCpG_name %in% cpg_names_all,]
## Histogram of samples in array data
df <- data.frame(n_samples = sapply(rds_list_mat, ncol))
# Plot with ggplot2
ggplot(df, aes(x = n_samples)) +
geom_histogram(bins = 100, fill = "steelblue", color = "white") +
theme_minimal(base_size = 14) +
labs(
title = "Distribution of number of samples per dataset",
x = "Number of samples",
y = "Count of datasets"
)
rm(all_cpgs, cpg_counts,folder1, rds_files1, cistrans_GoDMC_hvCpG_matched_control)
# combine everything into one vector
# all_values <- unlist(rds_list_mat, use.names = FALSE)
#
# # plot one histogram for all values
# hist(all_values, breaks = 100,
# main = "Histogram of all values in rds_list_mat",
# xlab = "Values")
# rm(all_values)
Maria’s paper: We defined an hvCpG in the following way:
detecthvCpGCutoff <- function(rds_list_mat){
## Within each dataset, calculate the CpGs variance, and keep the top 5%
top5pcvar <- lapply(rds_list_mat, function(mat) {
# Step 1: Calculate row variances
row_variances = rowVars(mat, na.rm = T)
df = data.frame(var=row_variances, cpg=names(row_variances))
# Step 2
top = top_n(df, as.integer(0.05*nrow(df)), var) ## slightly different than quantile cause keep ties
return(top)
})
## CpGs in the top 5% variance:
cpg_counts_top <- table(unlist(lapply(top5pcvar, rownames))) %>%
data.frame() %>% dplyr::rename("cpgs"="Var1", "all_cpgs_top5pc"="Freq")
## all covered CpGs:
cpg_counts <- table(unlist(lapply(rds_list_mat, rownames))) %>%
data.frame() %>% dplyr::rename("cpgs"="Var1", "all_cpgs"="Freq")
## Both:
cpg_counts_full <- merge(cpg_counts, cpg_counts_top, all.x = T)
rm(cpg_counts, cpg_counts_top)
## in 65% of datasets in which the CpG is covered (following quality control), it has methylation variance in the top 5% of all (non-removed) CpGs:
## rounding, to mimick Maria's approach
hvCpGs_repro_maria <- na.omit(cpg_counts_full[round(cpg_counts_full$all_cpgs_top5pc/cpg_counts_full$all_cpgs, 2) >= 0.65,])
return(hvCpGs_repro_maria)
}
hvCpGs_repro_maria <- detecthvCpGCutoff(rds_list_mat = rds_list_mat)
Maria detected 4143 hvCpGs. I detected 4377 hvCpGs. We both have 4143 hvCpGs in common.
set.seed(123) # for reproducibility
# Keep 3 random columns per element
rds_list_mat_3ind <- lapply(rds_list_mat, function(df) {
cols <- sample(ncol(df), 3) # choose 3 random columns
df[, cols, drop = FALSE] # subset while keeping data.frame
})
hvCpGs_repro_maria_3ind <- detecthvCpGCutoff(rds_list_mat = rds_list_mat_3ind)
## Plot
# install.packages("ggVennDiagram")
library(ggVennDiagram)
# List of items
x <- list("Full array" = hvCpGs_repro_maria$cpgs,
"Array 3ind/ds" = hvCpGs_repro_maria_3ind$cpg)
p <- ggVennDiagram(x, label_alpha = 0) + scale_fill_gradient(low = "#F4FAFE", high = "#4981BF")
p
# 2D Venn diagram
ggsave("../../figures/arrayCutoffLowPower3ind.pdf", plot = p, width = 5, height = 5)
The proportion of CpGs than Maria found hvCpGs is p(hvCpG) = 1.02%.
Within each dataset k, calculate the median sd of all CpG j
all_sd_jk <- sapply(rds_list_mat, function(k){
get_sd_k <- function(k){
## Calculate a vector of the row (=per CpG j) sd
return(rowSds(k, na.rm = T))
}
## Return a vector of sds, in a list for each dataset
return(get_sd_k(k))
})
## Plot:
df_long <- reshape2::melt(all_sd_jk, variable.name = "Vector", value.name = "SDs")
## Plot distributions of all vectors on the same graph
ggplot(df_long, aes(x = SDs, color = L1)) +
geom_density() +
labs(title = "Distribution of SDs accross datasets",
x = "SDs",
y = "Density") +
theme_minimal() +
theme(legend.position = "none") +
scale_x_sqrt()
## Calculate lambda per dataset
lambdas = sapply(all_sd_jk, function(x){quantile(x, 0.95, na.rm=T)/median(x, na.rm=T)})
names(lambdas) <- gsub(".95%", "", names(lambdas))
# Histogram with kernel density
ggplot(data.frame(lambda=lambdas, dataset=names(lambdas)),
aes(x = lambdas)) +
geom_histogram(aes(y = after_stat(density)),
colour = 1, fill = "white", binwidth = .1) +
geom_density(lwd = 1, colour = 4,
fill = 4, alpha = 0.25)+
theme_minimal() +
geom_vline(xintercept = median(lambdas), col = "red")
median(lambdas)
## [1] 4.192186
Which datasets have a higher lambda, and why?
df = data.frame(nind=sapply(rds_list_mat, ncol),
dataset=names(rds_list_mat),
lambda=lambdas,
tissue = sapply(strsplit(names(rds_list_mat), "_"), `[`, 1),
ethnicity=sapply(strsplit(names(rds_list_mat), "_"), `[`, 2)) %>%
mutate(dataset=ifelse(dataset %in% c("Blood_Cauc", "Blood_Hisp"), "Blood_Cauc_Hisp", dataset)) %>%
mutate(dataset=ifelse(dataset %in% c("Blood_Mexican", "Blood_PuertoRican "), "Blood_Mex_PuertoRican ", dataset)) %>%
mutate(dataset=ifelse(dataset %in% c("CD4+_Estonian", "CD8+_Estonian"), "CD4+_CD8+_Estonian", dataset)) %>%
mutate(dataset=ifelse(dataset %in% c("Saliva_Hisp", "Saliva_Cauc"), "Saliva_Hisp_Cauc", dataset))
ggplot(data = df,
aes(x=lambda, y=nind))+
geom_smooth(method = "lm")+
geom_point()+
geom_label_repel(aes(label = dataset, fill=dataset), size= 2, alpha=.8, max.overlaps = 25)+
theme_bw()+theme(legend.position = "none")+
scale_x_log10()
## `geom_smooth()` using formula = 'y ~ x'
## Emphasize the outliers
df_long$col = "grey"
df_long$col[df_long$L1 %in% "uterus"] <- "red"
ggplot(df_long, aes(x = SDs, group = L1, fill = col)) +
geom_density(data = df_long[!df_long$L1 %in% "uterus",], alpha = .5) +
geom_density(data = df_long[df_long$L1 %in% "uterus",], alpha = .6) +
scale_fill_manual(values = c("grey", "red"))+
labs(title = "Distribution of SDs accross datasets",subtitle = "Red=uterus",
x = "SDs",
y = "Density") +
theme_minimal() +
theme(legend.position = "none") +
scale_x_sqrt()
The equation is:
\[ \log\left(P(M_j)\right) = \sum_{i=1}^{n} \log\left( \sum_{Z_j=0}^{1} \left( \sum_{Z_{j,k}=0}^{1} P(M_{i,j} \mid Z_{j,k}) \times P(Z_{j,k} \mid Z_j) \right) \times P(Z_j) \right) \]
The transformation lead to weird values:
test <- readRDS("/mnt/old_user_accounts/p3/maria/PhD/Data/datasets/GEO/Raw_cleaned_beta_matrices_GEO/Blood_Hisp")
test[rownames(test) %in% "cg23089912",5:10]
## GSM1870975 GSM1870977 GSM1870979 GSM1870986 GSM1870989 GSM1870992
## cg23089912 0.8486523 0.8883389 0.8610275 0 0.8572614 0.8153023
test <- readRDS("/mnt/old_user_accounts/p3/maria/PhD/Data/datasets/GEO/BMIQ + 10 PCs + age + sex OUTLIERS REMOVED/Blood_Hisp.RDS")
test[rownames(test) %in% "cg23089912",5:10]
## GSM1870975 GSM1870977 GSM1870979 GSM1870986 GSM1870989
## cg23089912 0.001379438 0.002837506 0.0086327 1.124546e-17 0.002056392
## GSM1870992
## cg23089912 0.0001942933
# GSM1870986
# 0.00000000000000001124546 --> give extreme value in scaling, and p dnorm=0
The zero was weirdly transformed. Is that expected?
cpgs_of_interest <- c("cg27470047", "cg16958594", "cg01992382", "cg19313311")
library(matrixStats)
# Step 1: compute SDs per CpG per dataset
sds_per_dataset <- lapply(rds_list_mat, function(x) {
subset_mat <- x[rownames(x) %in% cpgs_of_interest, , drop = FALSE]
if (nrow(subset_mat) == 0) {
return(NULL)
}
rowSds(as.matrix(subset_mat), na.rm = TRUE) |>
setNames(rownames(subset_mat))
})
# Step 2: bind results into a data.frame
sds_df <- do.call(cbind, sds_per_dataset)
## Warning in (function (..., deparse.level = 1) : number of rows of result is not
## a multiple of vector length (arg 7)
# Step 3: median SD per CpG across datasets
median_sd_per_cpg <- apply(sds_df, 1, median, na.rm = TRUE)
median_sd_per_cpg = melt(median_sd_per_cpg) %>% rownames_to_column("CpG")
sds_df = melt(sds_df)
names(sds_df) = c("CpG", "dataset", "sd")
# Step 4: histogram of SD distributions per group
ggplot(sds_df, aes(x = sd)) +
geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
facet_wrap(~ CpG, scales = "free_y") +
theme_minimal() +
labs(title = "Distribution of SD per CpG",
x = "Standard deviation across samples",
y = "Count")+
geom_vline(data = median_sd_per_cpg, aes(xintercept = value))
\[ p0 = P(Z_{j,k}=0 \mid Z_j=0) \\ p1 = P(Z_{j,k}=1 \mid Z_j=1) \]
p1 = 0.65 in Maria’s approach (to be a hvCpG, it needs to be hypervariable in >65% of the datasets)
## Load pos2check that was used in the grid run
load("/home/alice/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_allsamples_30datasets_6906CpGs_0_8p0_0_65p1.RData")
cpgNamesgridrun <- rownames(results_MariasarraysREDUCED_allsamples_30datasets_6906CpGs_0_8p0_0_65p1)
# Step 1: Read results and compute AUC + threshold
files <- list.files(
"../../resultsDir/Arrays/hvCpGandControls",
pattern = "^results_Maria_6906CpGs_.*\\.RData$",
full.names = TRUE
)
auc_df <- tibble(p0=double(), p1=double(), AUC=double(), threshold=double())
roc_list <- list()
for (f in files) {
if (file.info(f)$size == 0) next
nm <- load(f)
res <- get(nm)
pstr <- str_match(f, "_(\\d+(?:_\\d+)?)p0_(\\d+(?:_\\d+)?)p1")[, 2:3]
p0 <- as.numeric(gsub("_", ".", pstr[1]))
p1 <- as.numeric(gsub("_", ".", pstr[2]))
label <- sprintf("p0=%.2f p1=%.2f", p0, p1)
df <- as.data.frame(res) %>% rownames_to_column("CpGpos") %>%
mutate(CpG=cpgNamesgridrun)
colnames(df)[2] <- "alpha"
truth <- ifelse(df$CpG %in% Marias_hvCpGs$CpG, 1, 0)
roc_obj <- try(roc(truth, df$alpha, quiet=TRUE), silent=TRUE)
if (inherits(roc_obj, "try-error")) next
aucv <- as.numeric(auc(roc_obj))
thr <- coords(roc_obj, "best", ret = "threshold", transpose = FALSE)[["threshold"]]
auc_df <- auc_df %>% add_row(p0, p1, AUC=aucv, threshold=thr)
pts <- coords(roc_obj, "all", ret=c("sensitivity","specificity"),
transpose = FALSE) %>%
as_tibble() %>%
mutate(FPR = 1 - specificity, TPR = sensitivity, label=label)
roc_list[[label]] <- pts
}
roc_all <- bind_rows(roc_list)
# Step 2: Plot ROC curves
roc_plot <- ggplot(roc_all, aes(FPR, TPR, group=label, color=label)) +
geom_line(linewidth=0.8, alpha=0.6) +
geom_abline(slope=1, linetype="dashed", color="gray50") +
labs(title="ROC Curves for Different (p0,p1)",
x="False Positive Rate", y="True Positive Rate") +
theme_minimal() + theme(legend.position="none")
# Step 3: AUC heatmap with thresholds
### To avoid multiple
auc_df <- auc_df %>% distinct(p0, p1, .keep_all = TRUE)
best <- auc_df |>
filter(AUC == max(AUC, na.rm = TRUE)) |>
slice(1) # in case of ties
heatmap_plot <- ggplot(auc_df, aes(p0, p1, fill=AUC)) +
geom_tile(color="white") +
# geom_text(aes(label=round(threshold,2)),
# size=3, color="black", check_overlap = TRUE) +
# Outline best tile
geom_rect(xmin = 0.8-0.025, xmax = 0.8+0.025, ymin = 0.65-0.025, ymax = 0.65+0.025,
fill = NA, color = "red", linewidth = 1) +
scale_fill_viridis_c(option = "plasma") +
labs(title="AUC & Optimal Threshold per (p0,p1)",
x="p0", y="p1") +
theme_minimal(base_size = 14) + theme(plot.title = element_blank())
# Step 4: Show both plots
print(roc_plot)
print(heatmap_plot)
auc_df[auc_df$p0 %in% 0.8 & auc_df$p1 %in% 0.65,]
## # A tibble: 1 × 4
## p0 p1 AUC threshold
## <dbl> <dbl> <dbl> <dbl>
## 1 0.8 0.65 0.986 0.735
The best threshold ploted in tiles is the alpha probability that gives the best sensitivity-specificity trade-off. It’s derived using a statistical rule (Youden’s J: maximizes (Sensitivity + Specificity – 1), giving the best overall balance between true positives and true negatives). It classifies a CpG as hypervariable in Maria’s approach if alpha > threshold
Very big discrepancies between Maria’s approach and mine for high p1 with low p0.
load("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_Maria_6906CpGs_0_8p0_0_65p1.RData")
df1 = results_Maria_pos2check_0_8p0_0_65p1
myplots <- function(df2, N){
plot1 = data.frame(df1, df2) %>%
tibble::rownames_to_column("CpGpos") %>%
mutate(CpGname = cpg_names_all[as.numeric(CpGpos)]) %>%
mutate(ishvCpG = ifelse(CpGname %in% Marias_hvCpGs$CpG, "1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>%
dplyr::rename(alpha_full = alpha, alpha_reduce = alpha.1) %>%
ggplot(aes(x = alpha_full, y = alpha_reduce)) +
geom_point(aes(fill=ishvCpG), pch =21, size =3, alpha = .3) +
geom_abline(slope = 1, linetype = "dashed") + theme_bw()
# === 1) Prepare the ORIGINAL ===
df_original = df1 %>%
data.frame() %>%
tibble::rownames_to_column("CpGpos") %>%
mutate(CpGname = cpg_names_all[as.numeric(CpGpos)]) %>%
mutate(ishvCpG = ifelse(
CpGname %in% Marias_hvCpGs$CpG,
"1500 hvCpG in Maria's study",
"1500 mQTL matching controls"
),
source = "original") %>%
melt()
# === 2) Prepare the MIMIC ===
df_mimic = df2 %>%
data.frame() %>%
tibble::rownames_to_column("CpGname") %>%
mutate(ishvCpG = ifelse(
CpGname %in% Marias_hvCpGs$CpG,
"1500 hvCpG in Maria's study",
"1500 mQTL matching controls"
),
source = "mimicAtlas") %>%
melt()
# === 3) Combine ===
df_combined = bind_rows(df_original, df_mimic)
# === 4) Plot ===
plot2 = ggplot(df_combined, aes(x = value, y = ishvCpG, fill = ..x..)) +
geom_density_ridges_gradient(scale = 3, rel_min_height = .01) +
facet_wrap(~source, ncol = 1) +
scale_fill_viridis(name = "alpha", option = "C") +
theme_bw() +
theme(axis.title.y = element_blank()) +
labs(title = "Probability alpha of being hypervariable in my algorithm",
subtitle = paste0("Original array data vs reduced (3 samples in ", N, " datasets)"))
print(plot1)
print(plot2)
}
for (i in c(3,20)){
load(paste0("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_3samples_", i, "datasets_6906CpGs_0_8p0_0_65p1.RData"))
myplots(df2 = get(paste0("results_MariasarraysREDUCED_3samples_", i, "datasets_6906CpGs_0_8p0_0_65p1")), N=i)
}
## Using CpGpos, CpGname, ishvCpG, source as id variables
## Using CpGname, ishvCpG, source as id variables
## Warning: The dot-dot notation (`..x..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(x)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Picking joint bandwidth of 0.0435
## Picking joint bandwidth of 0.0891
## Using CpGpos, CpGname, ishvCpG, source as id variables
## Using CpGname, ishvCpG, source as id variables
## Warning: Removed 767 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Picking joint bandwidth of 0.0308
## Picking joint bandwidth of 0.0891
for (i in c(3,20)){
load(paste0("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_4samples_", i, "datasets_6906CpGs_0_8p0_0_65p1.RData"))
myplots(df2 = get(paste0("results_MariasarraysREDUCED_3samples_", i, "datasets_6906CpGs_0_8p0_0_65p1")), N=i)
}
## Using CpGpos, CpGname, ishvCpG, source as id variables
## Using CpGname, ishvCpG, source as id variables
## Picking joint bandwidth of 0.0435
## Picking joint bandwidth of 0.0891
## Using CpGpos, CpGname, ishvCpG, source as id variables
## Using CpGname, ishvCpG, source as id variables
## Warning: Removed 767 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Picking joint bandwidth of 0.0308
## Picking joint bandwidth of 0.0891
for (i in c(3,20)){
load(paste0("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_5samples_", i, "datasets_6906CpGs_0_8p0_0_65p1.RData"))
myplots(df2 = get(paste0("results_MariasarraysREDUCED_3samples_", i, "datasets_6906CpGs_0_8p0_0_65p1")), N=i)
}
## Using CpGpos, CpGname, ishvCpG, source as id variables
## Using CpGname, ishvCpG, source as id variables
## Picking joint bandwidth of 0.0435
## Picking joint bandwidth of 0.0891
## Using CpGpos, CpGname, ishvCpG, source as id variables
## Using CpGname, ishvCpG, source as id variables
## Warning: Removed 767 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Picking joint bandwidth of 0.0308
## Picking joint bandwidth of 0.0891
# Initialize result data frame
auc_results <- data.frame(NbrDataset = integer(), AUC = numeric(), NbrSamples = integer())
# Loop through number of samples
for (s in 3:5) {
for (i in 3:30) {
# Load result
file_path <- paste0("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_", s, "samples_", i, "datasets_6906CpGs_0_8p0_0_65p1.RData")
if (!file.exists(file_path)) next
load(file_path)
df2 <- get(paste0("results_MariasarraysREDUCED_", s, "samples_", i, "datasets_6906CpGs_0_8p0_0_65p1"))
## Which proportion of the CpG tested are covered?
propCovered <- table(is.na(df2))[1]/nrow(df2)
# Build mimic dataframe with label
df_mimic <- df2 %>%
data.frame() %>%
tibble::rownames_to_column("CpGname") %>%
mutate(ishvCpG = ifelse(CpGname %in% Marias_hvCpGs$CpG, 1, 0))
# Identify alpha column
score_col <- if ("alpha" %in% colnames(df_mimic)) "alpha" else colnames(df_mimic)[2]
# Compute ROC and AUC
roc_obj <- roc(response = df_mimic$ishvCpG, predictor = df_mimic[[score_col]])
auc_val <- as.numeric(auc(roc_obj))
# Store result
auc_results <- bind_rows(auc_results, tibble(
NbrDataset = i,
AUC = auc_val,
NbrSamples = s,
propCovered = propCovered
))
}
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## For full dataset
# Loop through number of samples
for (i in 3:30) {
# Load result
file_path <- paste0("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_allsamples_",
i, "datasets_6906CpGs_0_8p0_0_65p1.RData")
if (!file.exists(file_path)) next
load(file_path)
df2 <- get(paste0("results_MariasarraysREDUCED_allsamples_", i, "datasets_6906CpGs_0_8p0_0_65p1"))
## Which proportion of the CpG tested are covered?
propCovered <- table(is.na(df2))[1]/nrow(df2)
# Build mimic dataframe with label
df_mimic <- df2 %>%
data.frame() %>%
tibble::rownames_to_column("CpGname") %>%
mutate(ishvCpG = ifelse(CpGname %in% Marias_hvCpGs$CpG, 1, 0))
# Identify alpha column
score_col <- if ("alpha" %in% colnames(df_mimic)) "alpha" else colnames(df_mimic)[2]
# Compute ROC and AUC
roc_obj <- roc(response = df_mimic$ishvCpG, predictor = df_mimic[[score_col]])
auc_val <- as.numeric(auc(roc_obj))
# Store result
auc_results <- bind_rows(auc_results, tibble(
NbrDataset = i,
AUC = auc_val,
NbrSamples = 15,
propCovered = propCovered
))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_results$NbrSamples[auc_results$NbrSamples %in% 15] <- "all"
ggplot(auc_results, aes(x = NbrDataset, y = AUC, color = factor(NbrSamples), group = NbrSamples)) +
geom_hline(linetype = 5, yintercept = 0.9) +
geom_line(size = 1.1) +
geom_point(data = auc_results, aes(size = propCovered)) +
scale_color_brewer(palette = "Set1") +
labs(
x = "Number of datasets",
y = "AUC",
color = "Samples\nper dataset",
) +
guides(
color = guide_legend(order = 1),
size = guide_legend(order = 2)
) +
theme_minimal(base_size = 14) +
theme(
legend.position = c(0.95, 0.05),
legend.justification = c("right", "bottom"),
legend.box = "horizontal",
legend.background = element_rect(fill = "white", color = "black", size = 0.4),
legend.key = element_rect(fill = "white", color = NA),
plot.margin = margin(5.5, 20, 5.5, 5.5, "pt"),
panel.clip = "off"
) +
coord_cartesian(clip = "off") +
scale_x_continuous(breaks = 3:30)+
scale_size(
range = c(0.1, 7),
breaks = c(0.2, 0.4, 0.6, 0.8), # Adjust depending on your data
name = "Proportion of\nCpGs covered"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in plot_theme(plot): The `panel.clip` theme element is not defined in
## the element hierarchy.
auc_results <- data.frame(NbrDataset = integer(), AUC = numeric(), NbrSamples = integer())
i = 16
# Load result
file_path <- paste0("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_allsamples_",
i, "datasets_6906CpGs_0_8p0_0_65p1.RData")
if (!file.exists(file_path)) next
load(file_path)
df2 <- get(paste0("results_MariasarraysREDUCED_allsamples_", i, "datasets_6906CpGs_0_8p0_0_65p1"))
## Which proportion of the CpG tested are covered?
propCovered <- (table(is.na(df2))/(sum(table(is.na(df2)))))[[1]]
# Build mimic dataframe with label
df_mimic <- df2 %>%
data.frame() %>%
tibble::rownames_to_column("CpGname") %>%
mutate(ishvCpG = ifelse(CpGname %in% Marias_hvCpGs$CpG, 1, 0))
# Identify alpha column
score_col <- if ("alpha" %in% colnames(df_mimic)) "alpha" else colnames(df_mimic)[2]
# Compute ROC and AUC
roc_obj <- roc(response = df_mimic$ishvCpG, predictor = df_mimic[[score_col]])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_val <- as.numeric(auc(roc_obj))
# Store result
auc_results <- bind_rows(auc_results, tibble(
NbrDataset = i,
AUC = auc_val,
NbrSamples = 15
))
auc_results
## NbrDataset AUC NbrSamples
## 1 16 0.9790176 15
lambdas are defined based on a 5% threshold, even if alpha is not. How to not hardcode them? MCMC on lambda? Link with alpha?